package com.egao.common.component.crawler.gongkao;

import cn.hutool.core.date.DateTime;
import cn.hutool.core.date.DateUtil;
import cn.hutool.core.util.StrUtil;
import com.alibaba.fastjson.JSON;
import com.egao.common.module.cms.entity.Official;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

import java.util.ArrayList;
import java.util.Date;
import java.util.List;

/**
 * Result-processing pipeline for the Gongkao (civil-service exam site) crawler.
 *
 * <p>Each crawled page is converted into an {@link Official} and appended to the
 * result list. Pages without a title or an official time are ignored. When a page
 * is older than the cut-off returned by {@code DateUtil.yesterday()}, the attached
 * spider (if any) is asked to stop and the page is discarded.
 *
 * <p>NOTE(review): if the spider runs with several threads, {@code process} may be
 * invoked concurrently; the supplied list would then need to be thread-safe — confirm
 * against the caller.
 *
 * @author pyx
 */
public class GongKaoPipeline implements Pipeline {

    /** Short link that is removed from the crawled content (matched literally). */
    private static final String PROMO_SHORT_LINK = "http://url.cn/5tyfX6Y";

    /** Quoted app-banner image URL that is removed from the crawled content (matched literally). */
    private static final String APP_BANNER_URL =
            "\"http://www.chinagwy.org/www/images/app_banner.jpg?v=20171107\"";

    /** Collected results; never {@code null}. */
    private final List<Official> list;

    /** Spider to stop once outdated records are reached; may be {@code null}. */
    private Spider oschinaSpider;

    /**
     * Creates a pipeline that collects results into an internal list,
     * retrievable through {@link #getList()}.
     */
    public GongKaoPipeline() {
        this(new ArrayList<Official>());
    }

    /**
     * Creates a pipeline that appends results to the given list.
     *
     * @param list target list for accepted records; when {@code null} an internal
     *             list is used instead so that {@code process} cannot fail on it
     */
    public GongKaoPipeline(List<Official> list) {
        this.list = list != null ? list : new ArrayList<Official>();
    }

    /**
     * Converts one crawl result into an {@link Official} and stores it.
     *
     * @param resultItems extracted fields of the crawled page; the {@code officialTime}
     *                    entry is rewritten in place with {@code ":00"} appended
     * @param task        the crawl task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        // Skip pages without a title.
        if (StrUtil.isBlank(resultItems.get("title"))) {
            return;
        }

        // Skip pages without an official time.
        if (StrUtil.isBlank(resultItems.get("officialTime"))) {
            return;
        }

        // Complete the time value by appending ":00"
        // (presumably the seconds part missing from the scraped text — TODO confirm).
        resultItems.put("officialTime", resultItems.get("officialTime") + ":00");

        // Round-trip through JSON to map the extracted fields onto the entity.
        Official official = JSON.parseObject(JSON.toJSONString(resultItems.getAll()), Official.class);

        // Without a parsed time the staleness check below is impossible; drop the record.
        Date officialTime = official.getOfficialTime();
        if (officialTime == null) {
            return;
        }

        // Do not collect data older than the cut-off. Only the sign of compareTo is
        // meaningful, so test for "< 0" rather than an exact -1.
        Date yesterday = DateUtil.yesterday();
        if (officialTime.compareTo(yesterday) < 0) {
            // The spider is optional (set via setter); only stop it when present.
            if (oschinaSpider != null) {
                oschinaSpider.stop();
            }
            return;
        }

        official.setContent(stripPromotionalContent(official.getContent()));

        list.add(official);
    }

    /**
     * Removes the promotional short link and the app-banner URL from the content.
     * Uses literal replacement: the targets are plain URLs, not regular expressions.
     *
     * @param content crawled content, may be {@code null}
     * @return the cleaned content, or {@code null} when the input was {@code null}
     */
    private static String stripPromotionalContent(String content) {
        if (content == null) {
            return null;
        }
        return content.replace(PROMO_SHORT_LINK, "").replace(APP_BANNER_URL, "");
    }

    /**
     * Returns the list that accepted records are appended to.
     *
     * @return the live result list (not a copy)
     */
    public List<Official> getList() {
        return list;
    }

    /**
     * Returns the spider that is stopped when an outdated record is encountered.
     *
     * @return the attached spider, or {@code null} if none has been set
     */
    public Spider getOschinaSpider() {
        return oschinaSpider;
    }

    /**
     * Attaches the spider that should be stopped when an outdated record is encountered.
     *
     * @param oschinaSpider the spider to control; may be {@code null}
     */
    public void setOschinaSpider(Spider oschinaSpider) {
        this.oschinaSpider = oschinaSpider;
    }
}
